To load desired packages
library(reshape2)
library(knitr)
library(dplyr) # for data manipulation
library(ggplot2) # for graphing
library(extrafont) # for main_theme
library(leaps) #for modeling
library(gam)
source("https://raw.githubusercontent.com/andkov/psy532/master/scripts/graphs/main_theme.R")
Find file path for data
Set file path for data
# ds <- Hmisc::spss.get(filePath, use.value.labels = TRUE)
# saveRDS(ds, "./data/hrs/hrs_retirement.rds")
ds.prime<- readRDS("./data/raw/hrs_retirement.rds")
# ds <- readRDS("https://github.com/andkov/psy532/raw/master/data/hrs/hrs_retirement.rds")
# as.data.frame(table(ds.prime$wave)) #for wave selection, to see which group had the largest n
Examine data and select wave (describes what portion of sample…) - done because of file size
names(ds.prime)
[1] "hhidpn" "wave" "hacohort" "ragender" "rahispan" "raracem" "rabmonth" "rabyear" "radyear" "raedyrs"
[11] "wtresp" "rahrsamp" "shlt" "shltc" "hltc" "cesd" "bmi" "smokev" "smoken" "drink"
[21] "hibp" "diab" "cancr" "lung" "heart" "strok" "psych" "arthr" "conde" "r1adlww"
[31] "r1iadlww" "adla" "r1adlw" "iadla" "r2adlr10" "mstot2a" "inpova" "sayret" "retemp" "lbrf"
[41] "jcocc" "inlbrf" "iyear" "retdat" "agey" "cogtot" "mstot" "vigact" "numb" "biyear"
[51] "bagey" "blbrf" "bcogtot" "retire" "oret" "tret" "wavey" "cogsc" "cogslope"
ds <- ds.prime[ds.prime$wave==10, ] # keep only wave 10, to reduce ds size and make more managable (choose other value?)
distinct(dplyr::select(ds, hhidpn)) %>% count()
Source: local data frame [1 x 1]
n
(int)
1 20337
# length(unique(ds[ , "hhidpn"]))
# psych::describe(ds)
# head(ds)
columnnames <- read.csv("./data/raw/hrs_column_names.csv",sep="\t")
# View(columnnames)
# @kntir basic_table ---------------------------------------
#Variables of interest - BMI as Y, Potential X's are -- rabyear, ragender, wtresp, raedyrs, shlt, cesd, smoken, drink, hibp, inpova, agey, cogtot, mstot, vigact, tret, conde, hacort (?)
# pairs(ds$bmi ~ ds$wtresp + ds$agey + ds$conde + ds$cogtot + ds$mstot + ds$raedyrs + ds$cesd + ds$shlt + ds$ragender + ds$raracem + ds$tret) #may need to change
#Note: that conde, cogtot, mstot, wtresp, tret, cesd, shlt and ragender may be linear
#Var's that appear to be related to BMI - hard to tell, most of them do seem related, no reason to exclude
#Plots of these variables - note move this after model selection stuff? May want to...
summary(ds) #pretty good, see if you can clean it up...
hhidpn wave hacohort ragender rahispan
Min. : 3010 Min. :10 0.hrs/ahead ovrlap : 38 1.male : 8791 0. not hispanic:17677
1st Qu.: 56274040 1st Qu.:10 1.ahead :1253 2.female:11546 1. hispanic : 2636
Median :202628020 Median :10 2.coda :1178 NA's : 24
Mean :306713930 Mean :10 3.hrs :7695
3rd Qu.:521444010 3rd Qu.:10 4.warbabies :2068
Max. :959738010 Max. :10 5.early babyboomers:3979
6.mid babyboomers :4126
raracem rabmonth rabyear radyear raedyrs wtresp
1.white/caucasian :14777 Min. : 1.000 Min. :1901 Min. :2010 Min. : 0.00 Min. : 437
2.black/african american: 3908 1st Qu.: 4.000 1st Qu.:1935 1st Qu.:2011 1st Qu.:12.00 1st Qu.: 2040
3.other : 1604 Median : 7.000 Median :1944 Median :2011 Median :12.00 Median : 3670
NA's : 48 Mean : 6.594 Mean :1943 Mean :2011 Mean :12.61 Mean : 4663
3rd Qu.:10.000 3rd Qu.:1953 3rd Qu.:2012 3rd Qu.:15.00 3rd Qu.: 5887
Max. :12.000 Max. :1959 Max. :2013 Max. :18.00 Max. :19408
NA's :19526 NA's :93
rahrsamp shlt shltc hltc cesd
0.not in sample :14814 1. excellent:1998 Min. :-4.000 1. much better : 0 Min. :0.000
1.in samp,hrs92 resp b.1931-41: 5523 2. very good:5989 1st Qu.: 0.000 2. somewhat better:1472 1st Qu.:0.000
3. good :6423 Median : 0.000 3. same :9301 Median :1.000
4. fair :4272 Mean : 0.024 4. somewhat worse :4068 Mean :1.511
5. poor :1646 3rd Qu.: 0.000 5. much worse : 0 3rd Qu.:2.000
NA's : 9 Max. : 4.000 NA's :5496 Max. :8.000
NA's :5482 NA's :1010
bmi smokev smoken drink hibp
Min. : 7.00 0. no : 8773 0.no :17173 0.no : 8936 1. yes :12156
1st Qu.:24.40 1. yes:11480 1.yes: 3077 1.yes:11399 0. no : 8151
Median :27.50 NA's : 84 NA's : 87 NA's : 2 2. tia/possible stroke : 0
Mean :28.51 3. disp prev record and has cond: 0
3rd Qu.:31.60 4. disp prev record and no cond : 0
Max. :79.10 (Other) : 0
NA's :379 NA's : 30
diab cancr
0. no :15598 0. no :17429
1. yes : 4725 1. yes : 2890
2. tia/possible stroke : 0 2. tia/possible stroke : 0
3. disp prev record and has cond: 0 3. disp prev record and has cond: 0
4. disp prev record and no cond : 0 4. disp prev record and no cond : 0
(Other) : 0 (Other) : 0
NA's : 14 NA's : 18
lung heart
0. no :18247 0. no :15521
1. yes : 2076 1. yes : 4799
2. tia/possible stroke : 0 2. tia/possible stroke : 0
3. disp prev record and has cond: 0 3. disp prev record and has cond: 0
4. disp prev record and no cond : 0 4. disp prev record and no cond : 0
(Other) : 0 (Other) : 0
NA's : 14 NA's : 17
strok psych
0. no :18865 0. no :16710
1. yes : 1346 1. yes : 3599
2. tia/possible stroke : 115 2. tia/possible stroke : 0
3. disp prev record and has cond: 0 3. disp prev record and has cond: 0
4. disp prev record and no cond : 0 4. disp prev record and no cond : 0
(Other) : 0 (Other) : 0
NA's : 11 NA's : 28
arthr conde r1adlww r1iadlww adla
1. yes :11433 Min. :0.000 Min. :0.000 Min. :0.000 Min. :0.0000
0. no : 8874 1st Qu.:1.000 1st Qu.:0.000 1st Qu.:0.000 1st Qu.:0.0000
2. tia/possible stroke : 0 Median :2.000 Median :0.000 Median :0.000 Median :0.0000
3. disp prev record and has cond: 0 Mean :2.109 Mean :0.045 Mean :0.497 Mean :0.3537
4. disp prev record and no cond : 0 3rd Qu.:3.000 3rd Qu.:0.000 3rd Qu.:1.000 3rd Qu.:0.0000
(Other) : 0 Max. :8.000 Max. :3.000 Max. :3.000 Max. :5.0000
NA's : 30 NA's :3 NA's :13243 NA's :13287 NA's :111
r1adlw iadla r2adlr10 mstot2a inpova
Min. :0.000 Min. :0.0000 Min. : 0.000 Min. : 2.00 0.hh inc above pov thresh:17747
1st Qu.:0.000 1st Qu.:0.0000 1st Qu.: 3.000 1st Qu.:12.00 1.hh inc below pov thresh: 2590
Median :0.000 Median :0.0000 Median : 4.000 Median :14.00
Mean :0.118 Mean :0.1587 Mean : 4.302 Mean :12.99
3rd Qu.:0.000 3rd Qu.:0.0000 3rd Qu.: 6.000 3rd Qu.:15.00
Max. :5.000 Max. :3.0000 Max. :10.000 Max. :15.00
NA's :13242 NA's :119 NA's :19168 NA's :19168
sayret retemp lbrf
0.not ret :7208 0.no retire empstat :11283 1.works ft :5339
1.completely retired :9055 1.only retire empstat : 7498 2.works pt :1195
2.partly retired :2426 2.retire plus other empstat: 1384 3.unemployed : 798
3.question irrelevant: 926 NA's : 172 4.partly ret :1486
NA's : 722 5.retired :9902
6.disabled : 540
7.not in lbrf:1077
jcocc inlbrf iyear retdat agey cogtot
02.prof specialty opr/tech sup: 589 0:no :11349 Min. :2010 Min. :1943 Min. : 50.33 Min. : 0.00
04.clerical/admin supp : 435 1:yes: 8818 1st Qu.:2011 1st Qu.:1995 1st Qu.: 57.25 1st Qu.:19.00
01.managerial specialty oper : 382 NA's : 170 Median :2011 Median :2002 Median : 65.75 Median :22.00
03.sales : 302 Mean :2011 Mean :2000 Mean : 66.90 Mean :21.55
09.personal svc : 180 3rd Qu.:2011 3rd Qu.:2007 3rd Qu.: 74.92 3rd Qu.:25.00
(Other) : 718 Max. :2012 Max. :2012 Max. :109.00 Max. :35.00
NA's :17731 NA's :11654 NA's :5370
mstot vigact numb biyear bagey blbrf bcogtot
Min. : 0.00 Min. : NA Min. : 1.000 Min. :1992 Min. :36.92 Min. :1.000 Min. : 4.00
1st Qu.:11.00 1st Qu.: NA 1st Qu.: 1.000 1st Qu.:1993 1st Qu.:52.92 1st Qu.:1.000 1st Qu.:20.00
Median :13.00 Median : NA Median : 7.000 Median :1999 Median :55.25 Median :1.000 Median :24.00
Mean :12.29 Mean :NaN Mean : 5.446 Mean :2001 Mean :57.54 Mean :2.771 Mean :23.19
3rd Qu.:14.00 3rd Qu.: NA 3rd Qu.: 9.000 3rd Qu.:2010 3rd Qu.:59.17 3rd Qu.:5.000 3rd Qu.:26.00
Max. :15.00 Max. : NA Max. :10.000 Max. :2012 Max. :93.50 Max. :7.000 Max. :35.00
NA's :5370 NA's :20337 NA's :219 NA's :9098
retire oret tret wavey cogsc cogslope
Min. :0.0000 Min. :1936 Min. :-74.661 Min. : 0.000 Min. : 1.823 Min. :-0.9870
1st Qu.:0.0000 1st Qu.:1995 1st Qu.:-15.747 1st Qu.: 0.000 1st Qu.:19.717 1st Qu.:-0.2908
Median :0.0000 Median :2002 Median : -8.911 Median :11.750 Median :22.249 Median :-0.2508
Mean :0.4405 Mean :2000 Mean :-10.621 Mean : 9.368 Mean :21.721 Mean :-0.2474
3rd Qu.:1.0000 3rd Qu.:2008 3rd Qu.: -3.081 3rd Qu.:17.667 3rd Qu.:24.290 3rd Qu.:-0.1961
Max. :1.0000 Max. :2013 Max. : 2.672 Max. :19.083 Max. :32.389 Max. : 0.3368
NA's :172 NA's :8114 NA's :8114 NA's :1858 NA's :2293
# dev.off() #use when invalid graphics
# plot1 <- ggplot2::ggplot(data=ds, aes(x=conde, y=bmi, color=shlt, shape=20)) + geom_point() +scale_shape_identity()+ geom_jitter()+ main_theme
#ds$bmi2 <- round(ds$bmi/5)*5 #due to having so many data points
# ds$conde2 <- round(ds$conde/5)*5
#May want to remove
plot1 <- ggplot2::ggplot(data=ds, aes(x=conde, y=bmi)) +
scale_shape_identity()+
# geom_jitter(color="forestgreen", shape=21, alpha=.5)+
geom_jitter(aes(color=smokev), shape=21, alpha=.5)+
# scale_color_manual(values=c("forestgreen"))+
main_theme +
# theme(legend.position="none") +
labs(color="Smoke")
plot1
#May want to keep
#Conde plot
plot1 <- ggplot2::ggplot(data=ds, aes(x=conde, y=bmi)) +
scale_shape_identity()+
geom_jitter(color="forestgreen", shape=21, alpha=.5)+
main_theme +
theme(legend.position="none") +
labs(xlab("Number of Chronic Health Conditions"), ylab ="BMI")
plot1
#Age Plot
plot2 <- ggplot2::ggplot(data=ds, aes(x=agey, y=bmi)) +
scale_shape_identity()+
geom_jitter(color="red", shape=21, alpha=.5)+
main_theme +
theme(legend.position="none") +
labs(xlab("Age (years)"), ylab ="BMI")
plot2
#Education Plot
plot3 <- ggplot2::ggplot(data=ds, aes(x=raedyrs, y=bmi)) +
scale_shape_identity()+
geom_jitter(color="brown", shape=21, alpha=.5)+
main_theme +
theme(legend.position="none") +
labs(xlab("Education (years)"), ylab ="BMI")
plot3
#Cogtot Plot
plot4 <- ggplot2::ggplot(data=ds, aes(x=cogtot, y=bmi)) +
scale_shape_identity()+
geom_jitter(color="blue", shape=21, alpha=.5)+
main_theme +
theme(legend.position="none") +
labs(xlab("Cognition"), ylab ="BMI")
plot4
#MsTot Plot
plot5 <- ggplot2::ggplot(data=ds, aes(x=mstot, y=bmi)) +
scale_shape_identity()+
geom_jitter(color="orange", shape=21, alpha=.5)+
main_theme +
theme(legend.position="none") +
labs(xlab("Mental Status (score)"), ylab ="BMI")
plot5
#Depressive Symptoms
plot6 <- ggplot2::ggplot(data=ds, aes(x=cesd, y=bmi)) +
scale_shape_identity()+
geom_jitter(color="purple", shape=21, alpha=.5)+
main_theme +
theme(legend.position="none") +
labs(xlab("Depressive Symptoms"), ylab ="BMI")
plot6
#Self-rated Health Score
plot7 <- ggplot2::ggplot(data=ds, aes(x=shlt, y=bmi)) +
scale_shape_identity()+
geom_jitter(color="deeppink", shape=21, alpha=.5)+
main_theme +
theme(legend.position="none") +
labs(xlab("Self-rated Health Score"), ylab ="BMI")
plot7
#Change in Self-rated Health Score
plot12 <- ggplot2::ggplot(data=ds, aes(x=smokev, y=bmi)) +
scale_shape_identity()+
geom_jitter(color="navyblue", shape=21, alpha=.5)+
main_theme +
theme(legend.position="none") +
labs(xlab("Change in Self-rated Health Score"), ylab ="BMI")
plot12
#Gender
plot8 <- ggplot2::ggplot(data=ds, aes(x=ragender, y=bmi)) +
scale_shape_identity()+
geom_jitter(color="lightgreen", shape=21, alpha=.5)+
main_theme +
theme(legend.position="none") +
labs(xlab("Gender (1=male, 2=female)"), ylab ="BMI")
plot8
#Race
plot9 <- ggplot2::ggplot(data=ds, aes(x=raracem, y=bmi)) +
scale_shape_identity()+
geom_jitter(color="black", shape=21, alpha=.5)+
main_theme +
theme(legend.position="none") +
labs(xlab("Race (1=White, 2=Black, 3=Other"), ylab ="BMI")
plot9
#Drink
plot10 <- ggplot2::ggplot(data=ds, aes(x=drink, y=bmi)) +
scale_shape_identity()+
geom_jitter(color="cornflowerblue", shape=21, alpha=.5)+
main_theme +
theme(legend.position="none") +
labs(xlab("Ever Drink"), ylab ="BMI")
plot10
#Smoke
plot11 <- ggplot2::ggplot(data=ds, aes(x=smokev, y=bmi)) +
scale_shape_identity()+
geom_jitter(color="cyan2", shape=21, alpha=.5)+
main_theme +
theme(legend.position="none") +
labs(xlab("Ever Smoke"), ylab ="BMI")
plot11
Inspect data and examine relationships between data
# with $
a <- ds$bmi # extracts column "bmi" from dataset "ds0"
class(a)
[1] "numeric"
str(a)
num [1:20337] 24 25.7 21.3 34.4 29.3 ...
Create vector containing only variables of interest to BMI #These variables are
# Manually create the vector that contains the names of the variables you would like to keep.
selectVars <- c("hhidpn", "bmi", "conde", "agey", "cogtot", "mstot", "raedyrs", "cesd", "shlt", "ragender", "raracem", "smokev", "drink", "wtresp", "shltc")
ds2 <- ds[,selectVars]
ds3 <- mutate(ds2, shltnum = substring(shlt, 1, 1)) #change the \\D command
ds4 <- dplyr::mutate(ds3, gendernum = substring(ragender, 1, 1)) #decide if needed...
ds5 <- dplyr::mutate(ds4, racenum = substring(raracem, 1, 1))
ds6 <- dplyr::mutate(ds5, smokenum = substring(smokev, 1, 1))
ds7 <- dplyr::mutate(ds6, drinknum = substring(drink, 1, 1))
# Removes columns of redundant information - e.g. shlt -- turned into numbers rather than individual categories, find out if ok...
ds8 <- select(ds7, -shlt)
ds9 <- select(ds8, -ragender)
ds10 <- select(ds9, -raracem)
ds11 <- select(ds10, -smokev)
ds12 <- select(ds11, -drink)
Load graph settings for creating figures
# load themes used to style graphs
source("./scripts/graphs/graph_themes.R")
#table settings
Use GAM or Stepwise selection to select best model - Variables chosen are age in years, number of health conditions, eduction in years and self-rated health score
#variables used - bmi + rabyear + wtresp+ + shlt + cesd+ conde+ agey+ cogtot+ mstot+ vigact + smokev + drink + shltc
#full
regft.full <- regsubsets(ds12$bmi~., data = ds12, nvmax=18)
summary(regft.full)
Subset selection object
Call: regsubsets.formula(ds12$bmi ~ ., data = ds12, nvmax = 18)
18 Variables (and intercept)
Forced in Forced out
hhidpn FALSE FALSE
conde FALSE FALSE
agey FALSE FALSE
cogtot FALSE FALSE
mstot FALSE FALSE
raedyrs FALSE FALSE
cesd FALSE FALSE
wtresp FALSE FALSE
shltc FALSE FALSE
shltnum2 FALSE FALSE
shltnum3 FALSE FALSE
shltnum4 FALSE FALSE
shltnum5 FALSE FALSE
gendernum2 FALSE FALSE
racenum2 FALSE FALSE
racenum3 FALSE FALSE
smokenum1 FALSE FALSE
drinknum1 FALSE FALSE
1 subsets of each size up to 18
Selection Algorithm: exhaustive
hhidpn conde agey cogtot mstot raedyrs cesd wtresp shltc shltnum2 shltnum3 shltnum4 shltnum5 gendernum2
1 ( 1 ) " " " " "*" " " " " " " " " " " " " " " " " " " " " " "
2 ( 1 ) " " "*" "*" " " " " " " " " " " " " " " " " " " " " " "
3 ( 1 ) " " "*" "*" " " " " " " " " " " " " " " " " " " " " " "
4 ( 1 ) " " "*" "*" " " " " "*" " " " " " " " " " " " " " " " "
5 ( 1 ) " " "*" "*" "*" " " "*" " " " " " " " " " " " " " " " "
6 ( 1 ) " " "*" "*" "*" " " "*" " " " " " " " " " " " " " " " "
7 ( 1 ) " " "*" "*" "*" " " "*" " " " " " " " " " " " " " " "*"
8 ( 1 ) " " "*" "*" "*" " " "*" " " " " " " " " " " "*" " " "*"
9 ( 1 ) " " "*" "*" "*" " " "*" " " " " " " " " "*" "*" " " "*"
10 ( 1 ) " " "*" "*" "*" " " "*" " " " " " " "*" "*" "*" " " "*"
11 ( 1 ) " " "*" "*" "*" " " "*" " " " " " " "*" "*" "*" "*" "*"
12 ( 1 ) " " "*" "*" "*" " " "*" " " " " "*" "*" "*" "*" "*" "*"
13 ( 1 ) " " "*" "*" "*" " " "*" " " " " "*" "*" "*" "*" "*" "*"
14 ( 1 ) " " "*" "*" "*" "*" "*" " " " " "*" "*" "*" "*" "*" "*"
15 ( 1 ) " " "*" "*" "*" "*" "*" " " " " "*" "*" "*" "*" "*" "*"
16 ( 1 ) "*" "*" "*" "*" "*" "*" " " " " "*" "*" "*" "*" "*" "*"
17 ( 1 ) "*" "*" "*" "*" "*" "*" "*" " " "*" "*" "*" "*" "*" "*"
18 ( 1 ) "*" "*" "*" "*" "*" "*" "*" "*" "*" "*" "*" "*" "*" "*"
racenum2 racenum3 smokenum1 drinknum1
1 ( 1 ) " " " " " " " "
2 ( 1 ) " " " " " " " "
3 ( 1 ) "*" " " " " " "
4 ( 1 ) "*" " " " " " "
5 ( 1 ) "*" " " " " " "
6 ( 1 ) "*" " " " " "*"
7 ( 1 ) "*" " " " " "*"
8 ( 1 ) "*" " " " " "*"
9 ( 1 ) "*" " " " " "*"
10 ( 1 ) "*" " " " " "*"
11 ( 1 ) "*" " " " " "*"
12 ( 1 ) "*" " " " " "*"
13 ( 1 ) "*" " " "*" "*"
14 ( 1 ) "*" " " "*" "*"
15 ( 1 ) "*" "*" "*" "*"
16 ( 1 ) "*" "*" "*" "*"
17 ( 1 ) "*" "*" "*" "*"
18 ( 1 ) "*" "*" "*" "*"
#Var's of interest - agey, conde,racenum2 (african-american), drinknum1 (yes), cogtot, raedyrs, shltnum4, smokenum1 (yes)
names(regft.full)
[1] "np" "nrbar" "d" "rbar" "thetab" "first" "last" "vorder" "tol"
[10] "rss" "bound" "nvmax" "ress" "ir" "nbest" "lopt" "il" "ier"
[19] "xnames" "method" "force.in" "force.out" "sserr" "intercept" "lindep" "nullrss" "nn"
[28] "call"
coef(regft.full,18)
(Intercept) hhidpn conde agey cogtot mstot raedyrs cesd
3.881552e+01 5.646507e-10 8.072686e-01 -1.895553e-01 7.727136e-02 3.974560e-02 -1.310662e-01 -1.657343e-02
wtresp shltc shltnum2 shltnum3 shltnum4 shltnum5 gendernum2 racenum2
-2.101632e-06 -2.685605e-01 9.923701e-01 1.354036e+00 1.710996e+00 1.421305e+00 -5.128229e-01 1.171970e+00
racenum3 smokenum1 drinknum1
-2.648775e-01 -2.680000e-01 -4.866865e-01
#coefficiants say...
regfull.sum <- summary(regft.full)
names(regfull.sum)
[1] "which" "rsq" "rss" "adjr2" "cp" "bic" "outmat" "obj"
par(mfrow=c(2,2))
rsq <-regfull.sum$rsq
plot(rsq,col="purple",xlab= "Num of Vars",ylab="R-Squared", pch=19, type="l")
which.min(rsq)
[1] 1
points(1, rsq[1], pch=4, col="black", lwd=5)
rss <- regfull.sum$rss
plot(rss,col="purple",xlab= "Num of Vars",ylab="RSS" , pch=19)
lines(rss, col="red",lty=2, lwd=1)
which.min(rss)
[1] 18
points(18, rss[18], pch=4, col="black", lwd=5)
cp <- regfull.sum$cp
plot(cp,col="purple",xlab= "Num of Vars",ylab="Cp", pch=19 )
lines(cp, col="red",lty=2, lwd=1)
which.min(cp)
[1] 13
points(13, cp[13], pch=4, col="black", lwd=5)
bic <- regfull.sum$bic
plot(bic,col="purple",xlab= "Num of Vars",ylab="BIC" , pch=19)
lines(bic, col="red",lty=2,lwd=1)
which.min(bic)
[1] 12
points(12, bic[12], pch=4, col="black", lwd=5)
#12 or 13 variable model is what best subset says
#cross-validation -why doesn't it work
# set.seed(1)
# train <- sample(c(TRUE,FALSE), nrow(ds10), rep=TRUE)
# test <- (!train)
#
# regft.best <- regsubsets(bmi~.,data=ds10[train,],nvmax=12)
# test.mat <- model.matrix(bmi~.,data=ds10[test,])
#
# val.errors = rep(NA,12)
# for (i in 1:12){
# + coefi=coef(regft.best, id=i)
# + pred=test.mat[,names(coefi)]%*%coefi
# + val.error[i]=mean((ds10$bmi[test]-pred)^2)
# }
#
# #second try...
# pred.regsubsets = function(object,newdata,id,...){
# + form=as.formula(object$call[[2]])
# + mat=model.matrix(form,newdata)
# + coefi=coef(object,id=id)
# + xvars=names(coefi)
# + mat[,xvars]%*%coefi
# }
#
# regft.best <- regsubsets(bmi~.,data=ds10,nvmax=12)
# coef(regfit.best,10)
#
# k =10
# set.seed(1)
# folds = sample(1:k, nrow(ds10), replace=TRUE)
# cv.errors = matrix(NA,k,12,dimnames=list(NULL, paste(1:12)))
#
# for (j in 1:k){
# + best.fit = regsubsets(bmi~.,data=ds10[folds!=j,], nvmax=12)
# + for(i in 1:12){
# +pred = predict(best.fit,ds10[folds==j,],id=i)
# + cv.errors[j,i]=mean( (ds10$bmi[folds==j]-pred)^2)
# }
# }
#
# mean.cv.errors = apply(cv.errors,2,mean)
# mean.cv.errors
# kable(summary(out)$coef, digits=2) #to make table for models!!! can have summary and what not...all that jazz
#i.e. make summary tables
#look at univariate models for the significant variables - they are:
Create figures for modeling
# d <- ds2 %>% dplyr::select(hhidpn, bmi, agey, conde, raedyrs, raracem, drink, cogtot, ragender) #this needs to be updated...
# head(d)
# str(d$bmi) #what does this do...?
# summary(ds2$bmi) #may want to remove
# names(ds2)
#Omit Na's from data
ds12a <- na.omit(ds12)
ds7a <- na.omit(ds7)
#these could possibly be called from above?
#Conde (by race) Graph - ideal one...
plot.1 <- ggplot2::ggplot(data=ds7a, aes(x=conde, y=bmi, fill=raracem)) +
scale_shape_identity()+
geom_jitter(shape=21, alpha=.5)+
main_theme +
scale_fill_manual(values=c("red", "blue", "green"))+
# labs(xlab="Number of chronic Health Conditions")+
guides(fill=guide_legend(title="Race"))
plot.1a <- plot.1 + facet_grid(. ~ raracem) #may want to take this out....
plot.1a
#age (by race) Graph
plot.2 <- ggplot2::ggplot(data=ds7a, aes(x=agey, y=bmi, fill=raracem)) +
scale_shape_identity()+
geom_jitter(shape=21, alpha=.5)+
main_theme +
scale_fill_manual(values=c("red", "blue", "green"))+
# labs(color="Race")+
guides(fill=guide_legend(title="Race"))
plot.2
plot.2 + facet_grid(. ~ raracem) #may want to take this out....
#cogtot by race
plot.3 <- ggplot2::ggplot(data=ds7a, aes(x=cogtot, y=bmi, fill=raracem)) +
scale_shape_identity()+
geom_jitter(shape=21, alpha=.5)+
main_theme +
scale_fill_manual(values=c("red", "blue", "green"))+
# labs(color="Race")+
guides(fill=guide_legend(title="Race"))
plot.3
plot.3 + facet_grid(. ~ raracem) #may want to take this out....
#education in years by race
plot.4 <- ggplot2::ggplot(data=ds7a, aes(x=raedyrs, y=bmi, fill=raracem)) +
scale_shape_identity()+
geom_jitter(shape=21, alpha=.5)+
main_theme +
scale_fill_manual(values=c("red", "blue", "green"))+
# labs(color="Race")+
guides(fill=guide_legend(title="Race"))
plot.4
plot.4 + facet_grid(. ~ raracem) #may want to take this out....
#Conde (by drinks).
plot.5 <- ggplot2::ggplot(data=ds7a, aes(x=conde, y=bmi, fill=drink)) +
scale_shape_identity()+
geom_jitter(shape=21, alpha=.5)+
main_theme +
scale_fill_manual(values=c("lightgreen", "indianred"))+
# labs(xlab="Number of chronic Health Conditions")+
guides(fill=guide_legend(title="Drink"))
plot.5
plot.5 + facet_grid(. ~ drink) #may want to take this out....
#age by if drinks
plot.6 <- ggplot2::ggplot(data=ds7a, aes(x=agey, y=bmi, fill=drink)) +
scale_shape_identity()+
geom_jitter(shape=21, alpha=.5)+
main_theme +
scale_fill_manual(values=c("lightgreen", "indianred"))+
# labs(color="Race")+
guides(fill=guide_legend(title="Drink"))
plot.6
plot.6 + facet_grid(. ~ drink) #may want to take this out....
#cogtot by if drinks
plot.7 <- ggplot2::ggplot(data=ds7a, aes(x=cogtot, y=bmi, fill=drink)) +
scale_shape_identity()+
geom_jitter(shape=21, alpha=.5)+
main_theme +
scale_fill_manual(values=c("lightgreen", "indianred"))+
# labs(color="Race")+
guides(fill=guide_legend(title="Drink"))
plot.7
plot.7 + facet_grid(. ~ drink) #may want to take this out....
#education in years by if drinks
plot.8 <- ggplot2::ggplot(data=ds7a, aes(x=raedyrs, y=bmi, fill=drink)) +
scale_shape_identity()+
geom_jitter(shape=21, alpha=.5)+
main_theme +
scale_fill_manual(values=c("lightgreen", "indianred"))+
# labs(color="Race")+
guides(fill=guide_legend(title="Drink"))
plot.8
plot.8 + facet_grid(. ~ drink) #may want to take this out....
#Gender
#Conde (by gender) Graph - ideal one...
plot.9 <- ggplot2::ggplot(data=ds7a, aes(x=conde, y=bmi, fill=ragender)) +
scale_shape_identity()+
geom_jitter(shape=21, alpha=.5)+
main_theme +
scale_fill_manual(values=c("darkorchid", "dodgerblue3"))+
# labs(xlab="Number of chronic Health Conditions")+
guides(fill=guide_legend(title="Gender"))
plot.9
plot.9 + facet_grid(. ~ ragender) #may want to take this out....
#age (by gender Graph
plot.10 <- ggplot2::ggplot(data=ds7a, aes(x=agey, y=bmi, fill=ragender)) +
scale_shape_identity()+
geom_jitter(shape=21, alpha=.5)+
main_theme +
scale_fill_manual(values=c("darkorchid", "dodgerblue3"))+
# labs(color="Race")+
guides(fill=guide_legend(title="Gender"))
plot.10
plot.10 + facet_grid(. ~ ragender) #may want to take this out....
#cogtot by gender
plot.11 <- ggplot2::ggplot(data=ds7a, aes(x=cogtot, y=bmi, fill=ragender)) +
scale_shape_identity()+
geom_jitter(shape=21, alpha=.5)+
main_theme +
scale_fill_manual(values=c("darkorchid", "dodgerblue3"))+
# labs(color="Race")+
guides(fill=guide_legend(title="Gender"))
plot.11
plot.11 + facet_grid(. ~ ragender) #may want to take this out....
#education in years by gender
plot.12 <- ggplot2::ggplot(data=ds7a, aes(x=raedyrs, y=bmi, fill=ragender)) +
scale_shape_identity()+
geom_jitter(shape=21, alpha=.5)+
main_theme +
scale_fill_manual(values=c("darkorchid", "dodgerblue3"))+
# labs(color="Race")+
guides(fill=guide_legend(title="Gender"))
plot.12
plot.12 + facet_grid(. ~ ragender) #may want to take this out....
Creates prediction function for models
#Original Model from hrs starter, bmi not of interest - note:delete/modify
# See Chang (2013), section 5.7
# See psy532-Issue-15: https://github.com/andkov/psy532/issues/15
# Given a model, predict values of yvar from xvar
# This supports one predictor and one predicted variable
# xrange: If NULL, determine the x range from the model object. If a vector with
# two numbers, use those as the min and max of the prediction range.
# samples: Number of samples across the x range.
# ...: Further arguments to be passed to predict()
predictvals <- function(model, xvar, yvar, xrange=NULL, samples=100, ...) {
# If xrange isn't passed in, determine xrange from the models.
# Different ways of extracting the x range, depending on model type
if (is.null(xrange)) {
if (any(class(model) %in% c("lm", "glm")))
xrange <- range(model$model[[xvar]])
else if (any(class(model) %in% "loess")) #may need to change loess to GAM
xrange <- range(model$x)
}
newdata <- data.frame(x = seq(xrange[1], xrange[2], length.out = samples))
names(newdata) <- xvar
newdata[[yvar]] <- predict(model, newdata = newdata, ...)
newdata
}
Creates models of variables of interest
#to omit NA's from data! Ask Dr. Koval about this...var selection
#Conde
# set.seed(1)
# train=sample(14608,14608/2)
# fit1 <- lm(bmi~conde,data=ds11,subset=train)
# mean((ds11$bmi-predict(fit1,ds11))[-train]^2)
# fit2 <- lm(bmi~poly(conde,2),data=ds11,subset=train)
# mean((ds11$bmi-predict(fit2,ds11))[-train]^2) #lowest...
# fit3 <- lm(bmi~poly(conde,3),data=ds11,subset=train)
# mean((ds11$bmi-predict(fit3,ds11))[-train]^2)
# fit4 <- lm(bmi~poly(conde,4),data=ds11,subset=train)
# mean((ds11$bmi-predict(fit4,ds11))[-train]^2)
# fit5 <- lm(bmi~poly(conde,5),data=ds11,subset=train)
# mean((ds11$bmi-predict(fit5,ds11))[-train]^2)
#
fit.1 <- glm(bmi~conde,data=ds12a)
fit.2 <- lm(bmi~poly(conde,2),data=ds12a)
fit.3 <- lm(bmi~poly(conde,3),data=ds12a)
fit.4 <- lm(bmi~poly(conde,4),data=ds12a)
fit.5 <- lm(bmi~poly(conde,5),data=ds12a)
anova(fit.1,fit.2,fit.3,fit.4,fit.5)
Analysis of Deviance Table
Model: gaussian, link: identity
Response: bmi
Terms added sequentially (first to last)
Df Deviance Resid. Df Resid. Dev
NULL 9416 294329
conde 1 12985 9415 281344
#USE AIC i guess..
# fit.1
# regsum1 <- summary(fit.1)
# aic1 <- regsum1$aic
#
# regsum2 <- summary(fit.2)
# aic2 <- regsum2$aic
#
# regsum3 <- summary(fit.3)
# aic3 <- regsum3$aic
#
# regsum4 <- summary(fit.4)
# aic4 <- regsum4$aic
#
# regsum5 <- summary(fit.5)
# aic5 <- regsum5$aic
#
# plot(1, aic1, col="purple",xlab= "Variable shape",ylab="AIC" ,xlim=c(1, 5), ylim=c(0, 100000), pch=20)
# points(2,aic2, col="purple",pch=20)
# points(3,aic3, col="purple",pch=20)
# points(4,aic4, col="purple",pch=20)
# points(5,aic5, col="purple",pch=20)
#seems like linear fit is the way to go for conde?
#Dr. Koval says use something like BIC...
#Rest of Var's...
#Age
# set.seed(1)
# train=sample(14608,7304)
# fit1 <- lm(bmi~agey,data=ds12a,subset=train)
# mean((ds12a$bmi-predict(fit1,ds12a))[-train]^2)
#
# fit2 <- lm(bmi~poly(agey,2),data=ds7a,subset=train)
# mean((ds7a$bmi-predict(fit2,ds7a))[-train]^2)
#
# fit3 <- lm(bmi~poly(agey,3),data=ds7a,subset=train)
# mean((ds7a$bmi-predict(fit3,ds7a))[-train]^2)
#
# fit4 <- lm(bmi~poly(agey,4),data=ds12a,subset=train)
# mean((ds12a$bmi-predict(fit4,ds12a))[-train]^2)
# fit5 <- lm(bmi~poly(agey,5),data=ds12a,subset=train)
# mean((ds12a$bmi-predict(fit5,ds12a))[-train]^2) #remove this stuff
fit.1 <- lm(bmi~agey,data=ds12a)
fit.2 <- lm(bmi~poly(agey,2),data=ds12a)
fit.3 <- lm(bmi~poly(agey,3),data=ds12a)
fit.4 <- lm(bmi~poly(agey,4),data=ds12a)
fit.5 <- lm(bmi~poly(agey,5),data=ds12a)
anova(fit.1,fit.2,fit.3,fit.4,fit.5) #simplest way to explain...quadratic
Analysis of Variance Table
Model 1: bmi ~ agey
Model 2: bmi ~ poly(agey, 2)
Model 3: bmi ~ poly(agey, 3)
Model 4: bmi ~ poly(agey, 4)
Model 5: bmi ~ poly(agey, 5)
Res.Df RSS Df Sum of Sq F Pr(>F)
1 9415 278859
2 9414 278825 1 34.151 1.1529 0.2830
3 9413 278800 1 24.797 0.8371 0.3602
4 9412 278797 1 2.895 0.0977 0.7546
5 9411 278761 1 36.072 1.2178 0.2698
#Raedyrs
fit.1 <- lm(bmi~raedyrs,data=ds12a)
fit.2 <- lm(bmi~poly(raedyrs,2),data=ds12a)
fit.3 <- lm(bmi~poly(raedyrs,3),data=ds12a)
fit.4 <- lm(bmi~poly(raedyrs,4),data=ds12a)
fit.5 <- lm(bmi~poly(raedyrs,5),data=ds12a)
anova(fit.1,fit.2,fit.3,fit.4,fit.5) #quadratic...
Analysis of Variance Table
Model 1: bmi ~ raedyrs
Model 2: bmi ~ poly(raedyrs, 2)
Model 3: bmi ~ poly(raedyrs, 3)
Model 4: bmi ~ poly(raedyrs, 4)
Model 5: bmi ~ poly(raedyrs, 5)
Res.Df RSS Df Sum of Sq F Pr(>F)
1 9415 292709
2 9414 292521 1 188.573 6.0691 0.01377 *
3 9413 292441 1 79.904 2.5717 0.10883
4 9412 292429 1 11.532 0.3711 0.54240
5 9411 292410 1 19.719 0.6347 0.42567
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#Cogtot
fit.1 <- lm(bmi~cogtot,data=ds12a)
fit.2 <- lm(bmi~poly(cogtot,2),data=ds12a)
fit.3 <- lm(bmi~poly(cogtot,3),data=ds12a)
fit.4 <- lm(bmi~poly(cogtot,4),data=ds12a)
fit.5 <- lm(bmi~poly(cogtot,5),data=ds12a)
anova(fit.1,fit.2,fit.3,fit.4,fit.5) #quadratic
Analysis of Variance Table
Model 1: bmi ~ cogtot
Model 2: bmi ~ poly(cogtot, 2)
Model 3: bmi ~ poly(cogtot, 3)
Model 4: bmi ~ poly(cogtot, 4)
Model 5: bmi ~ poly(cogtot, 5)
Res.Df RSS Df Sum of Sq F Pr(>F)
1 9415 293276
2 9414 292787 1 489.14 15.7247 7.38e-05 ***
3 9413 292746 1 41.22 1.3250 0.2497
4 9412 292744 1 2.40 0.0771 0.7812
5 9411 292743 1 1.00 0.0321 0.8579
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#For each variable --
#decide if adding race and gender, might want to
#Conde...
m_1c <- glm(formula= bmi ~ conde , data = ds7a )
p_1c <- predictvals(m_1c, "conde", "bmi")
#Age
m_2a <- glm(formula= bmi ~ agey + I(agey^2) , data = ds7a )
p_2a <- predictvals(m_2a, "agey", "bmi")
#Education
m_2e <- glm(formula= bmi ~ raedyrs + I(raedyrs^2), data = ds7a )
p_2e <- predictvals(m_2e, "raedyrs", "bmi")
#Cognitive Total
m_2ct <- glm(formula= bmi ~ cogtot + I(cogtot^2), data = ds7a )
p_2ct <- predictvals(m_2ct, "cogtot", "bmi")
Adds models to figures produced above
# attach a line to the graph produced in the previous chunk
#add CI/jitter to lines -- stat.smooth stuff I believe
#after visual examination - cubic chosen for number of health conditions (conde) and for agey (age of participant), linear chosen for education (in years)
#For Condition (by Race)
# model <- lm(bmi ~ conde + factor(raracem), data=ds7a)
# grid <- with(ds7a, expand.grid(
# conde = seq(min(conde), max(conde), length = 8),
# raracem = levels(factor(raracem))
# ))
# grid$bmi <- stats::predict(model, newdata=grid)
# err <- stats::predict(model, newdata=grid, se = TRUE)
# grid$ucl <- err$fit + 1.96 * err$se.fit
# grid$lcl <- err$fit - 1.96 * err$se.fit
# plot.1 + geom_smooth(aes(ymin = lcl, ymax = ucl), data=grid, stat="identity")
# plot.1 + geom_line(data=p_1c, colour="black", size=1.0)
# plot.1
plot.1ab <- plot.1 + stat_smooth(method="lm", formula= y ~ ns(x,1))
plot.1ab
#For Age (by Race)
# model2 <- lm(bmi ~ agey + factor(raracem), data=ds7a)
# grid2 <- with(ds7a, expand.grid(
# agey = seq(min(agey), max(agey), length = 120),
# raracem = levels(factor(raracem))
# ))
# grid2$bmi <- stats::predict(model2, newdata=grid2) #problem with code right here...
# err <- stats::predict(model2, newdata=grid2, se = TRUE)
# grid2$ucl <- err$fit + 1.96 * err$se.fit
# grid2$lcl <- err$fit - 1.96 * err$se.fit
# plot.2 + geom_smooth(aes(ymin = lcl, ymax = ucl), data=grid2, stat="identity")
# plot.2 + geom_line(data=p_2a, colour="black", size=1.0)
plot.2ab <- plot.2 + stat_smooth(method="lm", formula= y ~ ns(x,2))
plot.2ab
#For Education (by Race) - Not working...
# model3 <- lm(bmi ~ raedyrs + factor(raracem), data=ds7a)
# grid3 <- with(ds7a, expand.grid(
# raedyrs = seq(min(raedyrs), max(raedyrs), length = 20),
# raracem = levels(factor(raracem))
# ))
# grid3$bmi <- stats::predict(model3, newdata=grid3)
# err <- stats::predict(model3, newdata=grid3, se = TRUE)
# grid3$ucl <- err$fit + 1.96 * err$se.fit
# grid3$lcl <- err$fit - 1.96 * err$se.fit
# plot.3 + geom_smooth(aes(ymin = lcl, ymax = ucl), data=grid3, stat="identity")
plot.3ab <- plot.3 + stat_smooth(method="lm", formula= y ~ ns(x,2))
plot.3ab
#For Cognition (by Race) - Not working for some reason...
# model4 <- lm(bmi ~ cogtot + factor(raracem), data=ds7a)
# grid4 <- with(ds7a, expand.grid(
# cogtot = seq(min(cogtot), max(cogtot), length = 30),
# raracem = levels(factor(raracem))
# ))
# grid4$bmi <- stats::predict(model4, newdata=grid4)
# err <- stats::predict(model4, newdata=grid4, se = TRUE)
# grid4$ucl <- err$fit + 1.96 * err$se.fit
# grid4$lcl <- err$fit - 1.96 * err$se.fit
# plot.4 + geom_smooth(aes(ymin = lcl, ymax = ucl), data=grid4, stat="identity")
plot.4ab <- plot.4 + stat_smooth(method="lm", formula= y ~ ns(x,2))
plot.4ab
Data seem show that the more health conditions you have, the higher your BMI, while people’s self assessment of their health corresponds to the amount of health conditions they have (i.e. they assess their health as being worse the more health conditions they have) ###
Data seem show that the older you are, the lower your BMI is, no change between the genders###
Data seem show that the more education you have, the lower your BMI is (only a slight relationship though, not super strong), while race seems to ###
Data seem show that the higher your cognition, the higher your body mass. a bit surprising…. The relationship with race seems to show ###
Appears that race is related to BMI, with blacks/african americans having the strongest relation
Removes all variables except variables conisdered for models
# Manually create the vector that contains the names of the variables you would like to keep.
selectVars <- c("hhidpn", "bmi", "conde", "agey", "raedyrs", "cogtot", "drink", "raracem", "ragender")
dsL <- ds7a[,selectVars]
head(dsL)
hhidpn bmi conde agey raedyrs cogtot drink raracem ragender
1 3010 24.0 1 74.66666 12 17 0.no 1.white/caucasian 1.male
2 3020 25.7 3 72.00000 16 25 0.no 1.white/caucasian 2.female
3 10001010 21.3 0 71.33334 12 26 0.no 1.white/caucasian 1.male
5 10004010 29.3 3 70.50000 16 27 1.yes 1.white/caucasian 1.male
7 10013010 32.2 3 72.08334 12 20 0.no 1.white/caucasian 1.male
9 10038010 23.9 3 74.33334 16 29 1.yes 1.white/caucasian 1.male
Creates an rds and a csv file that contains only the variables of interest for models (reduces data set size)
#Creates derived data which contains only variables of interest for the 3 models
pathdsLhrscsv <- "./data/derived/hrs_drv_dsL.csv"
write.csv(dsL,pathdsLhrscsv, row.names=FALSE)
pathdsLhrsrds <- "./data/derived/hrs_drv_dsL.rds"
saveRDS(object=dsL, file=pathdsLhrsrds, compress="xz")
# # remove all but specified dataset
#rm(list=setdiff(ls(), c("ds8","dsL")))